library(caret)
library(corrplot)
library(GGally)
library(tidyverse)
library(car)
library(glmnet) # lasso regression
library(ggthemes)
library(vcd)
library(viridis)
library(rpart)
library(rpart.plot)
library(leaps) # Forward selection
library(boot) # bootstrap 
library(lmboot) # bootstrap models

MSDS 6372 Project 1

Goal

We would like to predict medical costs for insurance claims.

EDA

Things we need to do

  • Deal with missing data if there is any
  • Clean up variable names
  • change data types
  • split data into train/validation split first (then use training data for objectives)

EDA needs to include

  • discussion of trends between the response and predictors

Objective 1

We need to create an interpretable model

  • Build a model with the main goal of identifying key relationships that is highly interpretable.
  • Provide interpretation of the regression coefficients
    • Include hypotheses testing
    • Interpretation of regression coefficients
    • Confidence intervals
    • Mention the practical vs statistical significance of the predictors
  • Interpretation of at least a subset of the predictors in your final model
    • Tests on coefficients
    • assumption checking interpreting those coefficients
  • Feature selection

Objective 2

Develop a model that predicts as accurately as possible and generalizes well to future data

  • Train/validation or CV approach for model comparisons
  • Create a linear regression model WITH complexity (not just include a model with predictors that you’ve eliminated from objective 1)
  • Run one non-parametric model to compare to your complex model
  • Provide measures of fit for comparisons
    • MSE
    • R squared / Adjusted R squared
    • AIC & BIC
  • Use validation set to show results DO NOT tune model based on validation set
  • Feature selection and CV are a must here

Loading the data and cleaning up all variables

# Set the seed for reproducibility
# (the seed governs the random train/test split drawn below)
set.seed(123)

# Load data
# Path is relative to this file's directory; adjust if the project layout changes.
insurance_data <- read.csv("../raw_data/insurance.csv")

# Check for missing data
sum(is.na(insurance_data)) # No missing data
## [1] 0
# Create BMI categories
# insurance_data$bmi_category <- cut(insurance_data$bmi,
#                                    breaks = c(-Inf, 18.5, 24.9, 29.9, Inf),
#                                    labels = c("Underweight", "Normal weight", "Overweight", "Obesity"))


# Create North or South variable
# insurance_data$NORS = NA
# insurance_data$NORS = ifelse(grepl("south", insurance_data$region, ignore.case = TRUE), "south", "north")
# Convert appropriate columns to factors
# Coerce every character column of a data frame to a factor, leaving all
# other column types (numeric, integer, logical, existing factors) untouched.
# Returns the modified data frame.
convert_chr_to_factor <- function(df) {
  for (col_name in names(df)) {
    if (is.character(df[[col_name]])) {
      df[[col_name]] <- as.factor(df[[col_name]])
    }
  }
  df
}

insurance_data <- convert_chr_to_factor(insurance_data)

# Determine the size of the dataset and the training set
# (70/30 train/test split)
n <- nrow(insurance_data)
train_size <- floor(0.7 * n)

# Get random indices for the training set
# (reproducible because set.seed() was called above)
train_indices <- sample(seq_len(n), size = train_size)

# Split the data into training and test sets
# NOTE: test_data is held out strictly for final model evaluation (Objective 2).
train_data <- insurance_data[train_indices, ]
test_data <- insurance_data[-train_indices, ]
insurance_data <- train_data # renaming as insurance_data to make code easier to understand

# Quick sanity checks on the training split
head(insurance_data)
##     age    sex    bmi children smoker    region   charges
## 415  19 female 35.150        0     no northwest  2134.901
## 463  62 female 38.095        2     no northeast 15230.324
## 179  46 female 28.900        2     no southwest  8823.279
## 526  18 female 33.880        0     no southeast 11482.635
## 195  18   male 34.430        0     no southeast  1137.470
## 938  39 female 24.225        5     no northwest  8965.796
str(insurance_data)
## 'data.frame':    936 obs. of  7 variables:
##  $ age     : int  19 62 46 18 18 39 41 62 20 24 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 1 1 1 2 1 1 2 2 2 ...
##  $ bmi     : num  35.1 38.1 28.9 33.9 34.4 ...
##  $ children: int  0 2 2 0 0 5 3 0 0 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 2 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 2 1 4 3 3 2 4 3 4 1 ...
##  $ charges : num  2135 15230 8823 11483 1137 ...
summary(insurance_data)
##       age            sex           bmi           children     smoker   
##  Min.   :18.00   female:448   Min.   :15.96   Min.   :0.000   no :733  
##  1st Qu.:27.00   male  :488   1st Qu.:26.48   1st Qu.:0.000   yes:203  
##  Median :39.00                Median :30.78   Median :1.000            
##  Mean   :39.26                Mean   :30.87   Mean   :1.121            
##  3rd Qu.:51.00                3rd Qu.:34.80   3rd Qu.:2.000            
##  Max.   :64.00                Max.   :53.13   Max.   :5.000            
##        region       charges     
##  northeast:228   Min.   : 1136  
##  northwest:227   1st Qu.: 4910  
##  southeast:250   Median : 9623  
##  southwest:231   Mean   :13517  
##                  3rd Qu.:17096  
##                  Max.   :63770

Perform Univariate Analysis

# Define Freedman-Diaconis bin width calculation: 2 * IQR / n^(1/3).
# Fixes vs. the original:
#   * n previously counted NA entries while IQR excluded them, biasing the
#     width low for data containing NAs; NAs are now dropped up front.
#   * A zero IQR (heavily repeated values) or empty input produced a
#     zero/NaN width, which breaks geom_histogram(); we now fall back to a
#     range-based (Sturges-style) width so the result is always positive.
freedman_diaconis_bin_width <- function(data) {
  data <- data[!is.na(data)]
  n <- length(data)
  if (n == 0) {
    return(1)  # nothing to plot; any positive width is safe
  }
  iqr_val <- IQR(data)
  width <- 2 * iqr_val / (n^(1/3))
  if (!is.finite(width) || width <= 0) {
    # Degenerate spread: size bins off the data range via Sturges' rule.
    rng <- diff(range(data))
    width <- if (rng > 0) rng / (ceiling(log2(n)) + 1) else 1
  }
  width
}


# Univariate Analysis: one chart per column. Factor columns get a bar chart;
# numeric columns get a histogram (Freedman-Diaconis bin width) plus a boxplot.
for (var_name in names(insurance_data)) {

  values <- insurance_data[[var_name]]

  if (is.factor(values)) {
    # Categorical Variable Analysis using ggplot2
    bar_chart <- ggplot(insurance_data, aes(x = values)) +
      geom_bar(aes(fill = values), color = "black") +
      labs(title = paste("Distribution of", var_name), x = var_name) +
      theme_light() +
      scale_fill_brewer(palette = "Set2")

    print(bar_chart)

  } else {

    # Continuous Variable Analysis using ggplot2
    fd_width <- freedman_diaconis_bin_width(values)

    histogram <- ggplot(insurance_data, aes(x = values)) +
      geom_histogram(binwidth = fd_width, fill = "blue", color = "black", alpha = 0.7) +
      labs(title = paste("Distribution of", var_name), x = var_name) +
      theme_light()

    boxplot_chart <- ggplot(insurance_data, aes(y = values)) +
      geom_boxplot(fill = "blue", color = "black", alpha = 0.7) +
      labs(title = paste("Boxplot of", var_name), y = var_name) +
      theme_light()

    print(histogram)
    print(boxplot_chart)
  }
}

Bivariate Analysis

# Identify continuous columns
continuous_columns <- names(insurance_data)[sapply(insurance_data, function(x) !is.factor(x))]

# Pairwise scatter plots + correlations for the continuous variables.
# Uses the `.data` pronoun instead of aes_string(), which has been
# deprecated since ggplot2 3.0 (the lifecycle warning in the rendered
# output came from this chunk).
for (col1 in continuous_columns) {
  for (col2 in continuous_columns) {
    if (col1 != col2) {

      # Scatter plot with color based on values of col1
      scatter_plot <- ggplot(insurance_data, aes(x = .data[[col1]], y = .data[[col2]])) +
        geom_point(aes(color = .data[[col1]]), alpha = 0.5) +
        labs(title = paste("Scatter plot of", col1, "vs.", col2),
             x = col1, y = col2, color = col1) +
        theme_light() +
        scale_color_viridis_c()  # Add a color scale

      print(scatter_plot)

      # Print correlation
      correlation <- cor(insurance_data[[col1]], insurance_data[[col2]], use = "complete.obs")
      cat(paste("Correlation between", col1, "and", col2, "is:", round(correlation, 2)), "\n")

    }
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Correlation between age and bmi is: 0.09

## Correlation between age and children is: 0.03

## Correlation between age and charges is: 0.26

## Correlation between bmi and age is: 0.09

## Correlation between bmi and children is: 0

## Correlation between bmi and charges is: 0.19

## Correlation between children and age is: 0.03

## Correlation between children and bmi is: 0

## Correlation between children and charges is: 0.08

## Correlation between charges and age is: 0.26

## Correlation between charges and bmi is: 0.19

## Correlation between charges and children is: 0.08
# Identify categorical columns
categorical_columns <- names(insurance_data)[sapply(insurance_data, is.factor)]

# Continuous-by-categorical comparisons, one boxplot per pair.
# Two fixes vs. the original:
#   * aes_string() is deprecated (ggplot2 >= 3.0) -> use the `.data` pronoun.
#   * `aes(fill = col2)` mapped fill to the CONSTANT column-name string, so
#     every box got one identical fill level; fill must map to the column's
#     values via .data[[col2]].
for (col1 in continuous_columns) {
  for (col2 in categorical_columns) {

    # Box plot
    box_plot <- ggplot(insurance_data, aes(x = .data[[col2]], y = .data[[col1]])) +
      geom_boxplot(aes(fill = .data[[col2]])) +
      labs(title = paste("Boxplot of", col1, "by", col2),
           x = col2, y = col1, fill = col2) +
      theme_light() +
      scale_fill_viridis(discrete = TRUE)

    print(box_plot)

  }
}

# Print a contingency table for every ordered pair of distinct categorical
# variables (both orderings are shown, matching the exploratory output).
for (row_var in categorical_columns) {
  for (col_var in categorical_columns) {
    if (row_var != col_var) {

      # Contingency table
      xtab <- table(insurance_data[[row_var]], insurance_data[[col_var]])
      cat("\nContingency table for", row_var, "vs.", col_var, ":\n")
      print(xtab)

    }
  }
}
## 
## Contingency table for sex vs. smoker :
##         
##           no yes
##   female 369  79
##   male   364 124
## 
## Contingency table for sex vs. region :
##         
##          northeast northwest southeast southwest
##   female       105       117       114       112
##   male         123       110       136       119
## 
## Contingency table for smoker vs. sex :
##      
##       female male
##   no     369  364
##   yes     79  124
## 
## Contingency table for smoker vs. region :
##      
##       northeast northwest southeast southwest
##   no        176       180       189       188
##   yes        52        47        61        43
## 
## Contingency table for region vs. sex :
##            
##             female male
##   northeast    105  123
##   northwest    117  110
##   southeast    114  136
##   southwest    112  119
## 
## Contingency table for region vs. smoker :
##            
##              no yes
##   northeast 176  52
##   northwest 180  47
##   southeast 189  61
##   southwest 188  43
# Scatter plots of every continuous pair, colored by each categorical
# variable in turn. Rewritten with the `.data` pronoun because aes_string()
# is deprecated (ggplot2 >= 3.0).
for (col1 in continuous_columns) {
  for (col2 in continuous_columns) {
    if (col1 != col2) {
      for (cat_col in categorical_columns) {

        # Scatter plot with color based on categorical column
        scatter_plot <- ggplot(insurance_data, aes(x = .data[[col1]], y = .data[[col2]])) +
          geom_point(aes(color = .data[[cat_col]]), alpha = 0.5) +
          labs(title = paste("Scatter plot of", col1, "vs.", col2, "colored by", cat_col),
               x = col1, y = col2, color = cat_col) +
          theme_light() +
          scale_color_brewer(palette = "Set1")

        print(scatter_plot)
      }
    }
  }
}

# Charges rise with age along two clearly separated bands; coloring by
# smoking status shows smoking drives the separation (per-group OLS lines).
plot <- ggplot(insurance_data, aes(x=age, y=charges, color=smoker)) +
  geom_point(alpha=0.5) +
  geom_smooth(method="lm", se=FALSE) +  # Linear regression per group
  labs(title="Relationship between Age and Charges by Smoking Status") +
  theme_light() +
  scale_color_manual(values=c("red", "blue"))

print(plot)
## `geom_smooth()` using formula = 'y ~ x'

# Re-draw the age-vs-charges scatter colored by each remaining categorical
# variable (smoker is excluded; it is covered by the dedicated plot above).
categorical_columns <- names(insurance_data)[sapply(insurance_data, is.factor) & names(insurance_data) != "smoker"]

for (cat_col in categorical_columns) {
  # Map color through the `.data` pronoun rather than injecting the raw
  # vector insurance_data[[cat_col]] into aes(): the raw-vector form
  # produces an unreadable legend title and breaks under faceting.
  plot <- ggplot(insurance_data, aes(x = age, y = charges, color = .data[[cat_col]])) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE) +
    labs(title = paste("Relationship between Age and Charges by", cat_col),
         color = cat_col) +
    theme_light()
  print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

# Fit the decision tree model
# (method="anova" = regression tree on charges; the splits below motivate
# the engineered group variables)
fit <- rpart(charges ~ ., data=insurance_data, method="anova")

# Plot the decision tree
rpart.plot(fit, yesno=2, type=3, box.palette="RdBu", fallen.leaves=TRUE)

# create group variable based off decision tree
# group : smoker split, then age (<43) for non-smokers, bmi (<30) for smokers
insurance_data$group <- with(insurance_data, ifelse(smoker == "no" & age < 43, "non-smoker & age<43",
                                   ifelse(smoker == "no" & age >= 43, "non-smoker & age>=43",
                                          ifelse(smoker == "yes" & bmi < 30, "smoker & bmi<30", 
                                                 "smoker & bmi>=30"))))
# group2: bmi split applied to both smokers and non-smokers
insurance_data$group2 <- with(insurance_data, ifelse(smoker == "no" & bmi < 30, "non-smoker & bmi<30",
                                   ifelse(smoker == "no" & bmi >= 30, "non-smoker & bmi>=30",
                                          ifelse(smoker == "yes" & bmi < 30, "smoker & bmi<30", 
                                                 "smoker & bmi>=30"))))
# group3: like group, but non-smokers under 43 are further split by whether
# they have children. BUG FIX: the children != 0 branch previously reused
# the "0children" label, which made the two branches indistinguishable and
# the children split a no-op.
insurance_data$group3 <- with(insurance_data, ifelse(smoker == "no" & age < 43 & children == 0, "non-smoker & age<43 & 0children",
                                   ifelse(smoker == "no" & age < 43 & children != 0, "non-smoker & age<43 & >0children",
                                   ifelse(smoker == "no" & age >= 43, "non-smoker & age>=43",
                                          ifelse(smoker == "yes" & bmi < 30, "smoker & bmi<30", 
                                                 "smoker & bmi>=30")))))


# Convert the engineered labels to factors for plotting/faceting
insurance_data$group <- as.factor(insurance_data$group)
insurance_data$group2 <- as.factor(insurance_data$group2)
insurance_data$group3 <- as.factor(insurance_data$group3)

# Identify potential categorical columns for faceting
faceting_columns <- names(insurance_data)[sapply(insurance_data, function(x) is.factor(x))]

# One faceted age-vs-charges scatter per categorical variable, always
# colored by smoking status.
for (facet_var in faceting_columns) {
  faceted_scatter <- ggplot(insurance_data, aes(x = age, y = charges, color = smoker)) +
    geom_point(alpha = 0.6) +
    facet_wrap(as.formula(paste("~", facet_var))) +
    labs(title = paste("Age vs. Charges by Smoker Status, Faceted by", facet_var)) +
    theme_light()

  print(faceted_scatter)
}

# Age vs. charges within each tree-derived group; free y scales because the
# smoker groups sit at much higher charge levels than the non-smoker groups.
p <- ggplot(insurance_data, aes(x=age, y=charges, color=smoker)) +
  geom_point(alpha=0.6) +
  facet_wrap(~ group, scales="free_y") +  # Facet by the new group
  labs(title="Scatterplot of Age vs. Charges by Groups based on Tree Splits") +
  theme_light()

print(p)

# Fit the decision tree model
# (looser cp with capped depth to expose a few more candidate splits)
fit <- rpart(charges ~ ., data=insurance_data, method="anova", control=rpart.control(cp=0.001, maxdepth=3))

# Plot the decision tree
rpart.plot(fit, yesno=2, type=3, box.palette="RdBu", fallen.leaves=TRUE)

# Same age-vs-charges-by-category plots as earlier, now also covering the
# tree-derived group factors created above (smoker still excluded).
categorical_columns <- names(insurance_data)[sapply(insurance_data, is.factor) & names(insurance_data) != "smoker"]

for (cat_col in categorical_columns) {
  # Use the `.data` pronoun instead of injecting the raw vector into aes();
  # the raw-vector form yields a garbled legend title and breaks faceting.
  plot <- ggplot(insurance_data, aes(x = age, y = charges, color = .data[[cat_col]])) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE) +
    labs(title = paste("Relationship between Age and Charges by", cat_col),
         color = cat_col) +
    theme_light()
  print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

Linear explainable model

head(insurance_data)
##     age    sex    bmi children smoker    region   charges                group
## 415  19 female 35.150        0     no northwest  2134.901  non-smoker & age<43
## 463  62 female 38.095        2     no northeast 15230.324 non-smoker & age>=43
## 179  46 female 28.900        2     no southwest  8823.279 non-smoker & age>=43
## 526  18 female 33.880        0     no southeast 11482.635  non-smoker & age<43
## 195  18   male 34.430        0     no southeast  1137.470  non-smoker & age<43
## 938  39 female 24.225        5     no northwest  8965.796  non-smoker & age<43
##                   group2                          group3
## 415 non-smoker & bmi>=30 non-smoker & age<43 & 0children
## 463 non-smoker & bmi>=30            non-smoker & age>=43
## 179  non-smoker & bmi<30            non-smoker & age>=43
## 526 non-smoker & bmi>=30 non-smoker & age<43 & 0children
## 195 non-smoker & bmi>=30 non-smoker & age<43 & 0children
## 938  non-smoker & bmi<30 non-smoker & age<43 & 0children
str(insurance_data)
## 'data.frame':    936 obs. of  10 variables:
##  $ age     : int  19 62 46 18 18 39 41 62 20 24 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 1 1 1 2 1 1 2 2 2 ...
##  $ bmi     : num  35.1 38.1 28.9 33.9 34.4 ...
##  $ children: int  0 2 2 0 0 5 3 0 0 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 2 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 2 1 4 3 3 2 4 3 4 1 ...
##  $ charges : num  2135 15230 8823 11483 1137 ...
##  $ group   : Factor w/ 4 levels "non-smoker & age<43",..: 1 2 2 1 1 1 1 2 3 4 ...
##  $ group2  : Factor w/ 4 levels "non-smoker & bmi<30",..: 2 2 1 2 2 1 2 2 3 4 ...
##  $ group3  : Factor w/ 4 levels "non-smoker & age<43 & 0children",..: 1 2 2 1 1 1 1 2 3 4 ...
# Drop the tree-derived group variables so these engineered factors do not
# leak into feature selection and modeling below
drop_variable_for_feature <- c("group", "group2", "group3")
insurance_data <- insurance_data[, !colnames(insurance_data) %in% drop_variable_for_feature]


#######################
## Feature selection ##
#######################

## Lasso feature selection

# Convert factor variables to one-hot encoding
# (~ . - 1 drops the intercept, so BOTH levels of each factor get a dummy
# column; that is acceptable for a penalized fit, though it would be
# collinear in ordinary least squares.)
train_data_encoded <- model.matrix(~ . - 1, data = insurance_data)

# Convert data to matrix form
exclude_variable_name <- "charges"  # Replace with the actual variable name
x <- as.matrix(train_data_encoded[, !colnames(train_data_encoded) %in% exclude_variable_name]) # predictors
y <- train_data$charges # response

# Apply Lasso
lasso_model <- glmnet(x, y, alpha = 1) # alpha=1 indicates Lasso

# Cross-validation for lambda selection
cv.lasso <- cv.glmnet(x, y, alpha = 1)
plot(cv.lasso)

# Lambda minimizing cross-validated error
optimal_lambda <- cv.lasso$lambda.min

# Coefficients at optimal lambda
# (rows shown as "." are the variables Lasso shrinks to zero -- here sex
# and the northwest region dummy)
coef(lasso_model, s = optimal_lambda)
## 10 x 1 sparse Matrix of class "dgCMatrix"
##                          s1
## (Intercept)     -11706.2235
## age                232.7559
## sexfemale            .     
## sexmale              .     
## bmi                337.8689
## children           572.0103
## smokeryes        23603.4315
## regionnorthwest      .     
## regionsoutheast    -92.2366
## regionsouthwest   -331.1749
## Forward selection

# Forward stepwise search (leaps::regsubsets); nvmax defaults to 8, hence
# the 8 candidate model sizes summarized below.
reg.fwd=regsubsets(charges~.,data=insurance_data,method="forward")

summary(reg.fwd)$adjr2
## [1] 0.6261174 0.7104032 0.7412801 0.7454610 0.7456105 0.7457828 0.7457844
## [8] 0.7456120
summary(reg.fwd)$rss
## [1] 52510373307 40629211034 36258409342 35634203991 35575017163 35512689427
## [7] 35474244642 35460055213
summary(reg.fwd)$bic
## [1]  -908.1678 -1141.4321 -1241.1222 -1250.5346 -1245.2489 -1240.0486 -1234.2208
## [8] -1227.7537
# Plot BIC / adjusted R^2 / train RSS against model size, marking each optimum
par(mfrow=c(1,3))
bics<-summary(reg.fwd)$bic
plot(1:8,bics,type="l",ylab="BIC",xlab="# of predictors")
index<-which(bics==min(bics))
points(index,bics[index],col="red",pch=10)

adjr2<-summary(reg.fwd)$adjr2
plot(1:8,adjr2,type="l",ylab="Adjusted R-squared",xlab="# of predictors")
index<-which(adjr2==max(adjr2))
points(index,adjr2[index],col="red",pch=10)

rss<-summary(reg.fwd)$rss
plot(1:8,rss,type="l",ylab="train RSS",xlab="# of predictors")
index<-which(rss==min(rss))
points(index,rss[index],col="red",pch=10)

# BIC is minimized at 4 predictors: age, bmi, children, smoker
coef(reg.fwd,4)
## (Intercept)         age         bmi    children   smokeryes 
## -12811.3853    240.8897    354.1445    667.6838  23925.0815
print("")
## [1] ""
# Full 8-term model for comparison (adds sex and the region dummies)
coef(reg.fwd,8)
##     (Intercept)             age         sexmale             bmi        children 
##     -12473.5434        240.0827       -248.8674        369.9369        673.3514 
##       smokeryes regionnorthwest regionsoutheast regionsouthwest 
##      23938.2867       -595.1616       -948.1999      -1116.6608
print("")
## [1] ""
##################################
## Building Interpretable Model ##
##################################

# Creating all the different types of linear models that don't add too much complexity based off of residuals vs fitted as well as normal Q-Q plots assumptions are not met for any model configuration. 
# Candidate 1: untransformed additive model
simple_linear_model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance_data)
par(mfrow=c(2,2))
plot(simple_linear_model)

# Candidate 2: log response (to address right skew in charges)
log_linear_model <- lm(log(charges) ~ age + sex + bmi + children + smoker, data = insurance_data)
par(mfrow=c(2,2))
plot(log_linear_model)

# Candidate 3: log response with log-transformed continuous predictors
log_log_linear_model <- lm(log(charges) ~ log(age) + sex + log(bmi) + children + smoker, data = insurance_data)
par(mfrow=c(2,2))
plot(log_log_linear_model)

# Candidate 4: age x smoker interaction (motivated by the banding in the EDA)
interaction_linear_model <- lm(charges ~ age * smoker + sex + bmi + children, data = insurance_data)
par(mfrow=c(2,2))
plot(interaction_linear_model)

# Note: the age:smokeryes interaction is not significant here (p = 0.647)
summary(interaction_linear_model)
## 
## Call:
## lm(formula = charges ~ age * smoker + sex + bmi + children, data = insurance_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11428.5  -3062.9   -906.6   1665.3  29891.2 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -12629.35    1197.94 -10.543  < 2e-16 ***
## age              237.27      16.48  14.397  < 2e-16 ***
## smokeryes      23312.70    1480.77  15.744  < 2e-16 ***
## sexmale         -229.43     408.76  -0.561    0.575    
## bmi              356.53      33.33  10.698  < 2e-16 ***
## children         668.71     165.57   4.039 5.82e-05 ***
## age:smokeryes     16.58      36.25   0.458    0.647    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6192 on 929 degrees of freedom
## Multiple R-squared:  0.7467, Adjusted R-squared:  0.7451 
## F-statistic: 456.4 on 6 and 929 DF,  p-value: < 2.2e-16
#######################################
## Simple model for interpretability ##
#######################################

#PAIRED BOOTSTRAP
# Paired (case-resampling) bootstrap for the coefficient distributions;
# used because the residual diagnostics above showed assumption violations,
# so the usual t-based intervals are suspect.
print("Paired Bootstrap")
## [1] "Paired Bootstrap"
boot.p<-paired.boot(charges ~ age + bmi + children + smoker,
                    B=3000,
                    seed=1234,
                    data = insurance_data)
# 95% percentile bootstrap confidence intervals for each coefficient
t(apply(boot.p$bootEstParam,2,quantile,probs=c(.025,.975)))
##                    2.5%       97.5%
## (Intercept) -15184.2133 -10444.0836
## age            213.0127    268.0117
## bmi            285.2497    428.0634
## children       365.1938    973.1746
## smokeryes    22583.3657  25238.2086
# Final interpretable model: the 4-predictor set chosen by forward selection
simple <- lm(charges ~ age + bmi + children + smoker, data = insurance_data)
summary(simple)
## 
## Call:
## lm(formula = charges ~ age + bmi + children + smoker, data = insurance_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11304.4  -3089.2   -897.7   1721.3  29887.3 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12811.39    1164.39 -11.003  < 2e-16 ***
## age            240.89      14.67  16.423  < 2e-16 ***
## bmi            354.14      33.13  10.689  < 2e-16 ***
## children       667.68     165.34   4.038 5.83e-05 ***
## smokeryes    23925.08     491.07  48.721  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6187 on 931 degrees of freedom
## Multiple R-squared:  0.7465, Adjusted R-squared:  0.7455 
## F-statistic: 685.6 on 4 and 931 DF,  p-value: < 2.2e-16
###################
## Complex model ##
###################

# Complex candidate: separate age and bmi slopes for smokers vs. non-smokers
# (interaction-only terms), again with paired-bootstrap percentile CIs.
print("Paired Bootstrap with interaction")
## [1] "Paired Bootstrap with interaction"
boot.p<-paired.boot(charges ~ age:smoker + bmi:smoker + children,
                    B=3000,
                    seed=1234,
                    data = insurance_data)
# 95% percentile bootstrap confidence intervals for each coefficient
t(apply(boot.p$bootEstParam,2,quantile,probs=c(.025,.975)))
##                      2.5%      97.5%
## (Intercept)   -8948.30448 -5383.5289
## children        381.20252   889.4906
## age:smokerno    251.67237   302.0907
## age:smokeryes    98.19227   203.7076
## smokerno:bmi     72.00424   176.5724
## smokeryes:bmi  1003.82385  1152.1987
complex <- lm(charges ~ age:smoker + bmi:smoker + children, data = insurance_data)
summary(complex)
## 
## Call:
## lm(formula = charges ~ age:smoker + bmi:smoker + children, data = insurance_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10610.4  -1985.7  -1032.1     93.2  30199.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -7135.39     963.51  -7.406 2.92e-13 ***
## children        629.39     137.57   4.575 5.40e-06 ***
## age:smokerno    277.25      13.49  20.551  < 2e-16 ***
## age:smokeryes   151.31      24.04   6.295 4.74e-10 ***
## smokerno:bmi    121.10      29.07   4.166 3.38e-05 ***
## smokeryes:bmi  1077.72      37.37  28.841  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5147 on 930 degrees of freedom
## Multiple R-squared:  0.8247, Adjusted R-squared:  0.8238 
## F-statistic: 875.3 on 5 and 930 DF,  p-value: < 2.2e-16
######################
## Evaluation Model ##
######################

# Score a fitted regression model against held-out data. Returns a list:
#   MSE    - mean squared prediction error on test_data
#   R2     - 1 - SSE/SST, computed on test_data
#   Adj_R2 - R2 penalized for the number of model terms
#   AIC    - information criterion of the fitted model (training-based)
#   BIC    - information criterion of the fitted model (training-based)
evaluate_model <- function(model, test_data) {

  # Predictions and residuals on the held-out set
  predicted <- predict(model, newdata = test_data)
  observed <- test_data$charges  # replace 'charges' if your response variable name differs
  resid_test <- predicted - observed

  MSE <- mean(resid_test^2)

  # R squared on the test set
  total_ss <- sum((observed - mean(observed))^2)
  resid_ss <- sum(resid_test^2)
  R2 <- 1 - resid_ss / total_ss

  # Adjusted R^2 using the number of non-intercept model terms
  n_obs <- length(observed)
  n_terms <- length(coefficients(model)) - 1  # minus one to exclude the intercept
  Adj_R2 <- 1 - (1 - R2) * (n_obs - 1) / (n_obs - n_terms - 1)

  # Information criteria come from the fitted (training) likelihood
  list(
    MSE = MSE,
    R2 = R2,
    Adj_R2 = Adj_R2,
    AIC = AIC(model, k = 2),  # k=2 is the standard AIC penalty
    BIC = BIC(model)
  )
}


# Compare both candidate models on the held-out test set. The complex
# (interaction) model wins on every metric: lower MSE/AIC/BIC, higher R^2.
metrics.simple <- evaluate_model(simple, test_data)
metrics.complex <- evaluate_model(complex, test_data)
print("Simple")
## [1] "Simple"
metrics.simple
## $MSE
## [1] 33979472
## 
## $R2
## [1] 0.7529387
## 
## $Adj_R2
## [1] 0.7504495
## 
## $AIC
## [1] 19006.09
## 
## $BIC
## [1] 19035.14
print("Complex")
## [1] "Complex"
metrics.complex
## $MSE
## [1] 24590140
## 
## $R2
## [1] 0.8212076
## 
## $Adj_R2
## [1] 0.8189501
## 
## $AIC
## [1] 18662.81
## 
## $BIC
## [1] 18696.7

KNN

# Copies of the train/validation sets for KNN; KNN needs numeric inputs,
# so smoker is recoded to a 0/1 indicator.
train_data2 = insurance_data
validation_data2 = test_data

# train_data2$IsSouth = NA
# train_data2$IsSouth = ifelse(grepl("south", train_data1$region, ignore.case = TRUE), 1, 0)
# FIX: recode from the object's OWN column. The original read from
# insurance_data / test_data, which only worked because they alias these
# copies; it would silently corrupt the encoding if the copies diverged.
train_data2$smoker = ifelse(grepl("yes", train_data2$smoker, ignore.case = TRUE), 1, 0)

# validation_data2$IsSouth = NA
# validation_data2$IsSouth = ifelse(grepl("south", validation_data1$region, ignore.case = TRUE), 1, 0)
validation_data2$smoker = ifelse(grepl("yes", validation_data2$smoker, ignore.case = TRUE), 1, 0)


# Candidate neighborhood sizes to tune over
tuneGrid <- expand.grid(k = c(1:10, 20, 30))

# 10-fold cross-validation on the TRAINING data only; the validation set
# stays untouched for the final comparison, per the project requirements.
fitControl <- trainControl(method = "cv", number = 10)

knn_fit <- train(charges ~ age + bmi + children + smoker, data = train_data2, method = "knn",
                 trControl = fitControl, tuneGrid=tuneGrid)

print(knn_fit)

print(knn_fit)
## k-Nearest Neighbors 
## 
## 936 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 842, 842, 844, 844, 842, 840, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  12711.35  0.2066316  6796.781
##    2  11696.10  0.1874317  7301.533
##    3  11507.25  0.1817744  7572.681
##    4  11398.63  0.1742334  7725.796
##    5  11466.58  0.1576801  7971.900
##    6  11479.73  0.1492202  8163.695
##    7  11464.86  0.1476428  8239.592
##    8  11526.00  0.1342101  8327.375
##    9  11561.01  0.1247275  8399.734
##   10  11551.57  0.1238090  8481.278
##   20  11633.05  0.1068564  8970.151
##   30  11649.47  0.1021759  9086.045
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 4.
# Refit the final KNN using the CV-selected k (knn_fit$bestTune) rather than
# a hard-coded value. FIX: the original comment claimed "K = 6 is the best",
# the CV output actually selected k = 4, and the code then used k = 9 --
# the tuning result was never used.
best_k <- knn_fit$bestTune$k
predictors = c("age", "bmi", "children", "smoker")
response = "charges"

trainx = train_data2[, predictors]
trainy = train_data2$charges

testx = validation_data2[, predictors]
testy = validation_data2$charges

knn_model <- knnreg(trainx, trainy, k = best_k)

# Predicted vs. observed on the validation set
plot(testy, predict(knn_model, testx))

test_result = predict(knn_model, testx)

# FIX: MSE is the mean of SQUARED errors. The original formula,
# mean(test_result - testy)^2, squared the mean error instead, which
# drastically understated the validation MSE (75043 vs. the millions seen
# for the linear models).
MSE = mean((test_result - testy)^2)

print("MSE")
print(MSE)